Rey et al. 2020 data analysis

Running this notebook

To do.

Importing data

The PacMAN pipeline exports results as a phyloseq object which can be imported into R for analysis. This is how phyloseq objects are structured:

Read the phyloseq object and verify that we have a OTU table, a sample table, and a taxonomy table. A phylogenetic tree has not been calculated so that slot is empty.

physeq <- readRDS("../results_noblast_2samples/phyloseq_object.rds")

physeq
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 560 taxa and 2 samples ]
## sample_data() Sample Data:       [ 2 samples by 20 sample variables ]
## tax_table()   Taxonomy Table:    [ 560 taxa by 14 taxonomic ranks ]

While we can access these tables individually, the phyloseq package also has a function psmelt() to convert the phyloseq object into a single large data frame:

library(dplyr)

df <- psmelt(physeq) %>%
  mutate_if(is.character, list(~na_if(., ""))) %>%
  as_tibble()

df
## # A tibble: 1,120 × 37
##    OTU    Sample Abund…¹ eventID mater…² event…³ verba…⁴ local…⁵ verba…⁶ event…⁷
##    <chr>  <chr>    <int> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
##  1 asv.1  SAMN1…   16265 SAMN10… SAMN10… settle… 43.28 … Spain:… 7m      2017-0…
##  2 asv.2  SAMN1…   10209 SAMN10… SAMN10… filter… 43.37 … Spain:… surface 2017-0…
##  3 asv.3  SAMN1…    7823 SAMN10… SAMN10… settle… 43.28 … Spain:… 7m      2017-0…
##  4 asv.4  SAMN1…    7249 SAMN10… SAMN10… filter… 43.37 … Spain:… surface 2017-0…
##  5 asv.5  SAMN1…    6024 SAMN10… SAMN10… settle… 43.28 … Spain:… 7m      2017-0…
##  6 asv.6  SAMN1…    5350 SAMN10… SAMN10… settle… 43.28 … Spain:… 7m      2017-0…
##  7 asv.7  SAMN1…    5011 SAMN10… SAMN10… filter… 43.37 … Spain:… surface 2017-0…
##  8 asv.8  SAMN1…    4702 SAMN10… SAMN10… settle… 43.28 … Spain:… 7m      2017-0…
##  9 asv.9  SAMN1…    4315 SAMN10… SAMN10… filter… 43.37 … Spain:… surface 2017-0…
## 10 asv.10 SAMN1…    4268 SAMN10… SAMN10… settle… 43.28 … Spain:… 7m      2017-0…
## # … with 1,110 more rows, 27 more variables: occurrenceStatus <chr>,
## #   target_gene <chr>, subfragment <chr>, pcr_primer_forward <chr>,
## #   pcr_primer_reverse <chr>, pcr_primer_name_forw <chr>,
## #   pcr_primer_name_reverse <chr>, pcr_primer_reference <chr>,
## #   lib_layout <chr>, seq_meth <chr>, sop <chr>, votu_db <chr>, My_name <chr>,
## #   kingdom <chr>, class <chr>, order <chr>, genus <chr>, species <chr>,
## #   phylum <chr>, family <chr>, lastvalue <chr>, otu_seq_comp_appr <chr>, …

Exploratory data analysis

Abundance by sample type and phylum

library(tidyr)

stats <- df %>%
  group_by(eventRemarks, phylum) %>%
  summarize(abundance = sum(Abundance))

spread(stats, eventRemarks, abundance) %>%
  mutate(total = `settlement plates` + `filtered water`) %>%
  arrange(desc(total)) %>%
  knitr::kable()
phylum filtered water settlement plates total
NA 37582 34123 71705
Cnidaria 516 31943 32459
Arthropoda 10801 3412 14213
Annelida 0 4462 4462
Bryozoa 2 966 968
Mollusca 647 107 754
Chlorophyta 721 2 723
Bacillariophyta 644 15 659
Haptista 397 0 397
Rhodophyta 176 3 179
Basidiomycota 114 0 114
Prasinodermophyta 93 0 93
Chordata 2 79 81
Oomycota 56 0 56
Rotifera 9 23 32
Platyhelminthes 4 23 27
Porifera 12 0 12
Chaetognatha 5 0 5
Gastrotricha 0 4 4
Picozoa 4 0 4
Echinodermata 0 3 3
Entoprocta 0 2 2
Nematoda 0 2 2
library(ggplot2)

stats_simplified <- stats %>%
  mutate(
    phylum = case_when(phylum = abundance < 5000 ~ "Other", TRUE ~ phylum)
  ) %>%
  group_by(eventRemarks) %>%
  mutate(abundance = abundance / sum(abundance)) %>%
  group_by(eventRemarks, phylum) %>%
  summarize(abundance = sum(abundance))
  
ggplot() +
  geom_bar(data = stats_simplified, aes(y = eventRemarks, fill = phylum, x = abundance), stat = "identity") +
  scale_fill_brewer(palette = "Spectral", na.value = "#eeeeee") + theme_minimal()

Abundance (reads) by sample and phylum

stats <- df %>%
  rename(sample = Sample) %>%
  group_by(sample, eventRemarks, phylum) %>%
  summarize(abundance = sum(Abundance))

stats_simplified <- stats %>%
  mutate(
    phylum = case_when(phylum = abundance < 2000 ~ "Other", TRUE ~ phylum)
  ) %>%
  group_by(sample) %>%
  mutate(relative_abundance = abundance / sum(abundance)) %>%
  group_by(sample, eventRemarks, phylum) %>%
  summarize(abundance = sum(abundance), relative_abundance = sum(relative_abundance))
  
ggplot() +
  geom_bar(data = stats_simplified, aes(y = sample, fill = phylum, x = relative_abundance), stat = "identity") +
  scale_fill_brewer(palette = "Spectral", na.value = "#eeeeee") +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())

ggplot() +
  geom_bar(data = stats_simplified, aes(y = sample, fill = phylum, x = abundance), stat = "identity") +
  scale_fill_brewer(palette = "Spectral", na.value = "#eeeeee") +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())

Diversity (ASVs) by sample and phylum

stats <- df %>%
  filter(Abundance > 0) %>%
  rename(sample = Sample) %>%
  group_by(sample, eventRemarks, phylum) %>%
  summarize(asvs = n())

stats_simplified <- stats %>%
  mutate(
    phylum = case_when(phylum = asvs < 16 ~ "Other", TRUE ~ phylum)
  ) %>%
  group_by(sample, eventRemarks, phylum) %>%
  summarize(asvs = sum(asvs))
  
ggplot() +
  geom_bar(data = stats_simplified, aes(y = sample, fill = phylum, x = asvs), stat = "identity") +
  scale_fill_brewer(palette = "Spectral", na.value = "#eeeeee") +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())

Most abundant species

Let’s list the species with the highest relative abundance across all samples:

top_species <- df %>%
  filter(!is.na(species)) %>%
  group_by(species) %>%
  summarize(abundance = sum(Abundance)) %>%
  arrange(desc(abundance)) %>%
  head(20)

top_species %>% knitr::kable()
species abundance
Obelia dichotoma 18616
Bougainvillia muscus 12069
Dictyocha speculum 11547
Polydora neocaeca 3620
Amphibalanus eburneus 1978
Micromonas pusilla 664
Haliclystus tenuis 476
Polydora cornuta 317
Emiliania huxleyi 304
Ficopomatus enigmaticus 302
Balanus trigonus 289
Hydra canadensis 201
Austrominius modestus 165
Aeginopsis laurentii 163
Boccardiella hamata 163
Palmaria decipiens 129
Bittium reticulatum 120
Lepiota sp. MUSH020_07 111
Prasinoderma coloniale 93
Ascidiella aspersa 75

For the most abundant species, plot the abundance by sample:

stats <- df %>%
  filter(species %in% top_species$species) %>%
  group_by(species, eventRemarks, Sample) %>%
  summarize(abundance = sum(Abundance))

ggplot() +
  geom_jitter(data = stats, aes(y = species, x = abundance, color = eventRemarks), pch = 21, height = 0.1, width = 0) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal()

Invasive species

In this section we will check our results against the World Register of Introduced Marine Species (WRiMS). The repository contains a CSV file containing the LSIDs for all species in WRiMS.

wrims_lsids <- read.csv("wrims_lsids.csv")

invasives <- df %>%
  filter(lsid %in% wrims_lsids$lsid)

invasives %>%
  group_by(phylum, species) %>%
  summarize(abundance = sum(Abundance)) %>%
  arrange(desc(abundance)) %>%
  rmarkdown::paged_table()

ASV accumulation curves

To do.

Alpha and beta diversity

To do.

Multidimensional scaling

ord <- ordinate(physeq = physeq, method = "PCoA", distance = "bray")

plot_ordination(physeq = physeq, ordination = ord, type = "samples", color = "eventRemarks", shape = "locality") +
  geom_point(aes(color = eventRemarks), alpha = 1, size = 4) +
  geom_text(aes(label = materialSampleID), alpha = 0.7, size = 3, vjust = 2) +
  stat_ellipse(aes(group = eventRemarks)) +
  scale_color_brewer(palette = "Set1") +
  theme_classic()
## NULL
# todo: Canonical analysis of principal coordinates